{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "cc22f97c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "# Put the parent directory on the import path (presumably the checkout that\n",
    "# contains the DPF package - TODO confirm). The membership check keeps\n",
    "# re-running this cell from appending duplicate entries.\n",
    "PARENT_DIR = '../'\n",
    "if PARENT_DIR not in sys.path:\n",
    "    sys.path.append(PARENT_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "eed27ad8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f1b0de58145a484a9a59563d5d4fc72c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from DPF import DatasetReader, ShardsDatasetConfig\n",
    "\n",
    "# Build the config for the 'example_dataset' directory, naming the columns\n",
    "# that hold the image file names and the captions.\n",
    "config = ShardsDatasetConfig.from_path_and_columns(\n",
    "    'example_dataset',\n",
    "    image_name_col='image_name',\n",
    "    text_col='caption',\n",
    ")\n",
    "\n",
    "# Read the dataset described by the config; later cells work on `processor`.\n",
    "reader = DatasetReader()\n",
    "processor = reader.read_from_config(config)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18855d10",
   "metadata": {},
   "source": [
    "# Text filters"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b824b14",
   "metadata": {},
   "source": [
    "## Language detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ff0de3b8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Columns will be added: ['lang', 'lang_score']\n",
      "INFO: Pandarallel will run on 16 workers.\n",
      "INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\n"
     ]
    }
   ],
   "source": [
    "from DPF.filters.texts.lang_filter import LangFilter\n",
    "\n",
    "# Language filter reading from the 'text' column of the dataframe.\n",
    "lang_filter = LangFilter(text_column_name='text')\n",
    "print('Columns will be added:', lang_filter.result_columns)\n",
    "\n",
    "# Run the filter; its result columns are added to processor.df.\n",
    "processor.apply_column_filter(lang_filter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fad30dd6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>image_path</th>\n",
       "      <th>text</th>\n",
       "      <th>split_name</th>\n",
       "      <th>lang</th>\n",
       "      <th>lang_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>example_dataset/0.tar/0.jpg</td>\n",
       "      <td>шотландцы в национальной одежде с флагом. - ba...</td>\n",
       "      <td>0</td>\n",
       "      <td>ru</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>example_dataset/0.tar/1.jpg</td>\n",
       "      <td>королевская золотая корона с драгоценностями н...</td>\n",
       "      <td>0</td>\n",
       "      <td>ru</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>example_dataset/0.tar/2.jpg</td>\n",
       "      <td>Пасха</td>\n",
       "      <td>0</td>\n",
       "      <td>be</td>\n",
       "      <td>0.49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>example_dataset/0.tar/3.jpg</td>\n",
       "      <td>округа нью-джерси карта печати - elizabeth sto...</td>\n",
       "      <td>0</td>\n",
       "      <td>uk</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>example_dataset/0.tar/4.jpg</td>\n",
       "      <td>английская золотая корона с драгоценностями, и...</td>\n",
       "      <td>0</td>\n",
       "      <td>ru</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>495</th>\n",
       "      <td>example_dataset/2.tar/495.jpg</td>\n",
       "      <td>bokeh Background</td>\n",
       "      <td>2</td>\n",
       "      <td>de</td>\n",
       "      <td>0.35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496</th>\n",
       "      <td>example_dataset/2.tar/496.jpg</td>\n",
       "      <td>Healthcare, Medicine,</td>\n",
       "      <td>2</td>\n",
       "      <td>ro</td>\n",
       "      <td>0.96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>497</th>\n",
       "      <td>example_dataset/2.tar/497.jpg</td>\n",
       "      <td>A set of business teams icons that include edi...</td>\n",
       "      <td>2</td>\n",
       "      <td>en</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>498</th>\n",
       "      <td>example_dataset/2.tar/498.jpg</td>\n",
       "      <td>Vector floral pattern icon collection</td>\n",
       "      <td>2</td>\n",
       "      <td>es</td>\n",
       "      <td>0.59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499</th>\n",
       "      <td>example_dataset/2.tar/499.jpg</td>\n",
       "      <td>Set of Brown ribbons, banners, badges and labe...</td>\n",
       "      <td>2</td>\n",
       "      <td>en</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>500 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                        image_path  \\\n",
       "0      example_dataset/0.tar/0.jpg   \n",
       "1      example_dataset/0.tar/1.jpg   \n",
       "2      example_dataset/0.tar/2.jpg   \n",
       "3      example_dataset/0.tar/3.jpg   \n",
       "4      example_dataset/0.tar/4.jpg   \n",
       "..                             ...   \n",
       "495  example_dataset/2.tar/495.jpg   \n",
       "496  example_dataset/2.tar/496.jpg   \n",
       "497  example_dataset/2.tar/497.jpg   \n",
       "498  example_dataset/2.tar/498.jpg   \n",
       "499  example_dataset/2.tar/499.jpg   \n",
       "\n",
       "                                                  text split_name lang  \\\n",
       "0    шотландцы в национальной одежде с флагом. - ba...          0   ru   \n",
       "1    королевская золотая корона с драгоценностями н...          0   ru   \n",
       "2                                                Пасха          0   be   \n",
       "3    округа нью-джерси карта печати - elizabeth sto...          0   uk   \n",
       "4    английская золотая корона с драгоценностями, и...          0   ru   \n",
       "..                                                 ...        ...  ...   \n",
       "495                                   bokeh Background          2   de   \n",
       "496                              Healthcare, Medicine,          2   ro   \n",
       "497  A set of business teams icons that include edi...          2   en   \n",
       "498              Vector floral pattern icon collection          2   es   \n",
       "499  Set of Brown ribbons, banners, badges and labe...          2   en   \n",
       "\n",
       "    lang_score  \n",
       "0          1.0  \n",
       "1          1.0  \n",
       "2         0.49  \n",
       "3          1.0  \n",
       "4          1.0  \n",
       "..         ...  \n",
       "495       0.35  \n",
       "496       0.96  \n",
       "497        1.0  \n",
       "498       0.59  \n",
       "499        1.0  \n",
       "\n",
       "[500 rows x 5 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the dataframe after the language filter has been applied.\n",
    "processor.df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7c6824f1",
   "metadata": {},
   "source": [
    "## Google Translate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "624123b7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['text_translated']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 33%|███▎      | 9/27 [00:09<00:13,  1.33it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[GoogleTranslateFilter] , retry: 0/2\n",
      "[GoogleTranslateFilter] , retry: 1/2\n",
      "[GoogleTranslateFilter] , retry: 2/2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 59%|█████▉    | 16/27 [00:41<00:15,  1.42s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[GoogleTranslateFilter] , retry: 0/2\n",
      "[GoogleTranslateFilter] , retry: 1/2\n",
      "[GoogleTranslateFilter] , retry: 2/2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 67%|██████▋   | 18/27 [01:12<01:04,  7.16s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[GoogleTranslateFilter] , retry: 0/2\n",
      "[GoogleTranslateFilter] , retry: 1/2\n",
      "[GoogleTranslateFilter] , retry: 2/2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 27/27 [01:47<00:00,  3.97s/it]\n"
     ]
    }
   ],
   "source": [
    "from DPF.filters.texts.google_translate_filter import GoogleTranslateFilter\n",
    "\n",
    "# Translation filter reading from the 'text' column. Batch size and retry\n",
    "# behaviour are set explicitly (timeout_on_error is presumably in seconds -\n",
    "# TODO confirm against the filter's documentation).\n",
    "translate_filter = GoogleTranslateFilter(\n",
    "    text_column_name='text',\n",
    "    max_symbols_in_batch=2000,\n",
    "    timeout_on_error=10,\n",
    "    num_retries_per_batch=2,\n",
    ")\n",
    "print(translate_filter.result_columns)\n",
    "\n",
    "# Run the filter; its result columns are added to processor.df.\n",
    "processor.apply_column_filter(translate_filter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fa2bd114",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>image_path</th>\n",
       "      <th>text</th>\n",
       "      <th>split_name</th>\n",
       "      <th>lang</th>\n",
       "      <th>lang_score</th>\n",
       "      <th>text_translated</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>example_dataset/0.tar/0.jpg</td>\n",
       "      <td>шотландцы в национальной одежде с флагом. - ba...</td>\n",
       "      <td>0</td>\n",
       "      <td>ru</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Scots in national clothes with flag. - balmora...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>example_dataset/0.tar/1.jpg</td>\n",
       "      <td>королевская золотая корона с драгоценностями н...</td>\n",
       "      <td>0</td>\n",
       "      <td>ru</td>\n",
       "      <td>1.0</td>\n",
       "      <td>королевская золотая корона с драгоценностями н...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>example_dataset/0.tar/2.jpg</td>\n",
       "      <td>Пасха</td>\n",
       "      <td>0</td>\n",
       "      <td>be</td>\n",
       "      <td>0.49</td>\n",
       "      <td>Easter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>example_dataset/0.tar/3.jpg</td>\n",
       "      <td>округа нью-джерси карта печати - elizabeth sto...</td>\n",
       "      <td>0</td>\n",
       "      <td>uk</td>\n",
       "      <td>1.0</td>\n",
       "      <td>New Jersey counties map print - elizabeth stoc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>example_dataset/0.tar/4.jpg</td>\n",
       "      <td>английская золотая корона с драгоценностями, и...</td>\n",
       "      <td>0</td>\n",
       "      <td>ru</td>\n",
       "      <td>1.0</td>\n",
       "      <td>английская золотая корона с драгоценностями, и...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>495</th>\n",
       "      <td>example_dataset/2.tar/495.jpg</td>\n",
       "      <td>bokeh Background</td>\n",
       "      <td>2</td>\n",
       "      <td>de</td>\n",
       "      <td>0.35</td>\n",
       "      <td>bokeh background</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496</th>\n",
       "      <td>example_dataset/2.tar/496.jpg</td>\n",
       "      <td>Healthcare, Medicine,</td>\n",
       "      <td>2</td>\n",
       "      <td>ro</td>\n",
       "      <td>0.96</td>\n",
       "      <td>Healthcare, Medicine,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>497</th>\n",
       "      <td>example_dataset/2.tar/497.jpg</td>\n",
       "      <td>A set of business teams icons that include edi...</td>\n",
       "      <td>2</td>\n",
       "      <td>en</td>\n",
       "      <td>1.0</td>\n",
       "      <td>A set of business teams icons that include edi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>498</th>\n",
       "      <td>example_dataset/2.tar/498.jpg</td>\n",
       "      <td>Vector floral pattern icon collection</td>\n",
       "      <td>2</td>\n",
       "      <td>es</td>\n",
       "      <td>0.59</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499</th>\n",
       "      <td>example_dataset/2.tar/499.jpg</td>\n",
       "      <td>Set of Brown ribbons, banners, badges and labe...</td>\n",
       "      <td>2</td>\n",
       "      <td>en</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Set of Brown ribbons, banners, badges and labe...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>500 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                        image_path  \\\n",
       "0      example_dataset/0.tar/0.jpg   \n",
       "1      example_dataset/0.tar/1.jpg   \n",
       "2      example_dataset/0.tar/2.jpg   \n",
       "3      example_dataset/0.tar/3.jpg   \n",
       "4      example_dataset/0.tar/4.jpg   \n",
       "..                             ...   \n",
       "495  example_dataset/2.tar/495.jpg   \n",
       "496  example_dataset/2.tar/496.jpg   \n",
       "497  example_dataset/2.tar/497.jpg   \n",
       "498  example_dataset/2.tar/498.jpg   \n",
       "499  example_dataset/2.tar/499.jpg   \n",
       "\n",
       "                                                  text split_name lang  \\\n",
       "0    шотландцы в национальной одежде с флагом. - ba...          0   ru   \n",
       "1    королевская золотая корона с драгоценностями н...          0   ru   \n",
       "2                                                Пасха          0   be   \n",
       "3    округа нью-джерси карта печати - elizabeth sto...          0   uk   \n",
       "4    английская золотая корона с драгоценностями, и...          0   ru   \n",
       "..                                                 ...        ...  ...   \n",
       "495                                   bokeh Background          2   de   \n",
       "496                              Healthcare, Medicine,          2   ro   \n",
       "497  A set of business teams icons that include edi...          2   en   \n",
       "498              Vector floral pattern icon collection          2   es   \n",
       "499  Set of Brown ribbons, banners, badges and labe...          2   en   \n",
       "\n",
       "    lang_score                                    text_translated  \n",
       "0          1.0  Scots in national clothes with flag. - balmora...  \n",
       "1          1.0  королевская золотая корона с драгоценностями н...  \n",
       "2         0.49                                             Easter  \n",
       "3          1.0  New Jersey counties map print - elizabeth stoc...  \n",
       "4          1.0  английская золотая корона с драгоценностями, и...  \n",
       "..         ...                                                ...  \n",
       "495       0.35                                   bokeh background  \n",
       "496       0.96                              Healthcare, Medicine,  \n",
       "497        1.0  A set of business teams icons that include edi...  \n",
       "498       0.59                                               None  \n",
       "499        1.0  Set of Brown ribbons, banners, badges and labe...  \n",
       "\n",
       "[500 rows x 6 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the original captions next to their translations.\n",
    "processor.df[['text', 'text_translated']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0699bfd6",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:.mlspace-dpf_llava]",
   "language": "python",
   "name": "conda-env-.mlspace-dpf_llava-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
